#@title Requirements
# Core data stack
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import folium  # needed for interactive map
from folium.plugins import HeatMap

# Statistics / modeling
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers import Dense, Activation
Para una correcta participación en la competencia Data Triathlon sigue este formato de notebook propuesto por el equipo de data de Platzi. Esto ayudará en la calificación de tu notebook.
Secciones del notebook:
*Para revisar las reglas de la competencia ve a este blogpost.
En los datos encontrarás información de las exportaciones e importaciones de Colombia de 1962 a 2017. Fueron extraídos del OEC: The Observatory of Economic Complexity.
Tu labor será desarrollar un análisis exploratorio para encontrar insights valiosos a partir de esos datos y otros que recopiles.
Para el desarrollo de tu análisis puedes resolver las siguientes preguntas:
También puedes resolver preguntas que a ti se te ocurran.
Para extraer información adicional a la proporcionada por Platzi puedes extraerla directamente de la fuente de datos original. El dataset cuenta con la información extraída del OEC: The Observatory of Economic Complexity, específicamente de la versión legacy la cual permite la descarga de archivos a partir de una URL y de la API expuesta por ellos.
Para tu análisis recolecta más datos de esta y otras fuentes que veas necesarias.
#@title Geographical Data import
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
# Resolve a place name to a geopy Location carrying its coordinates.
def findGeocode(city, max_retries=5):
    """Return the geopy ``Location`` for *city*, or ``None``.

    Parameters
    ----------
    city : str
        Place name (city or country) to geocode via Nominatim.
    max_retries : int, optional
        Number of attempts after a ``GeocoderTimedOut`` before giving up
        and returning ``None`` (default 5). The original implementation
        retried by recursing with no bound, which can overflow the stack
        if the service keeps timing out.
    """
    # Specify the user_agent as your app name; it should not be None.
    geolocator = Nominatim(user_agent="your_app_name")
    for _ in range(max_retries):
        try:
            # geocode() may itself return None for unknown names.
            return geolocator.geocode(city)
        except GeocoderTimedOut:
            continue  # transient timeout: try again
    return None  # every attempt timed out
Para la limpieza de datos puedes utilizar herramientas como Pandas y numpy con el fin de limpiar y estructurar todo tipo de datos nulos o vacíos que no sean necesarios para el análisis de los datos requeridos.
#@title Settings
# Mount Google Drive so the competition CSV files can be read from it.
from google.colab import drive
drive.mount('/content/drive')
# Base directory that holds every Data Triathlon dataset.
path = '/content/drive/My Drive/data/Dataset_Data_Triathlon/'
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
#@title Main data sets Load
def _load_dataset(label, filename):
    """Read a pipe-separated CSV from `path` and report its shape.

    Prints exactly the same message as the original per-dataset lines:
    '<label> Data set have a dimension of  (rows, cols)  records and
    columns respectively'.
    """
    df = pd.read_csv(path + filename, sep='|')
    print(label + ' Data set have a dimension of ', df.shape, ' records and columns respectively')
    return df

# One loader call per table replaces five copy-pasted read/print pairs.
products = _load_dataset('products', 'products_sitc_rev2.csv')
groups = _load_dataset('groups', 'groups_sitc_rev2.csv')
countrys = _load_dataset('countrys', 'country_names.csv')
imports = _load_dataset('imports', 'colombia_imports.csv')
exports = _load_dataset('exports', 'colombia_exports.csv')
products Data set have a dimension of (988, 4) records and columns respectively groups Data set have a dimension of (65, 3) records and columns respectively countrys Data set have a dimension of (263, 4) records and columns respectively imports Data set have a dimension of (775902, 7) records and columns respectively exports Data set have a dimension of (799406, 7) records and columns respectively
#@title Basic Dataset Info
# Dump the schema summary (dtypes, non-null counts, memory) of each table.
print('Here we can see some basic info from the differents data sets')
for label, frame in (('Product', products),
                     ('groups', groups),
                     ('countrys', countrys),
                     ('imports', imports),
                     ('exports', exports)):
    print(' ')
    print(label)
    print(' ')
    frame.info()
Here we can see some basic info from the differents data sets Product <class 'pandas.core.frame.DataFrame'> RangeIndex: 988 entries, 0 to 987 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 988 non-null int64 1 id 988 non-null object 2 sitc 988 non-null int64 3 name 988 non-null object dtypes: int64(2), object(2) memory usage: 31.0+ KB groups <class 'pandas.core.frame.DataFrame'> RangeIndex: 65 entries, 0 to 64 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 65 non-null int64 1 id 65 non-null int64 2 category 65 non-null object dtypes: int64(2), object(1) memory usage: 1.6+ KB countrys <class 'pandas.core.frame.DataFrame'> RangeIndex: 263 entries, 0 to 262 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 263 non-null int64 1 id 263 non-null object 2 id_3char 263 non-null object 3 name 263 non-null object dtypes: int64(1), object(3) memory usage: 8.3+ KB imports <class 'pandas.core.frame.DataFrame'> RangeIndex: 775902 entries, 0 to 775901 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 775902 non-null int64 1 year 775902 non-null int64 2 origin 775902 non-null object 3 dest 775902 non-null object 4 sitc4 775902 non-null int64 5 export_val 731962 non-null float64 6 import_val 719568 non-null float64 dtypes: float64(2), int64(3), object(2) memory usage: 41.4+ MB exports <class 'pandas.core.frame.DataFrame'> RangeIndex: 799406 entries, 0 to 799405 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 799406 non-null int64 1 year 799406 non-null int64 2 origin 799406 non-null object 3 dest 799406 non-null object 4 sitc4 799406 non-null int64 5 export_val 715852 non-null float64 6 import_val 756158 non-null float64 dtypes: float64(2), int64(3), object(2) memory usage: 42.7+ MB
In this first step our goal is to build a full dataset with all the available fields, using the primary and foreign keys present in the transactional datasets (the exports and imports files).
Let's fit this data with the following steps
# Tag every row so the two frames stay distinguishable after concatenation.
imports['Transaction'] = 'import'
imports
| Unnamed: 0 | year | origin | dest | sitc4 | export_val | import_val | Transaction | |
|---|---|---|---|---|---|---|---|---|
| 0 | 11564 | 1962 | civ | col | 712 | 13000.0 | 0.0 | import |
| 1 | 11565 | 1962 | civ | col | 1210 | 0.0 | 12000.0 | import |
| 2 | 17741 | 1962 | cod | col | 2925 | 2000.0 | 0.0 | import |
| 3 | 20216 | 1962 | cog | col | 2925 | 2000.0 | 0.0 | import |
| 4 | 25005 | 1962 | dza | col | 1210 | 0.0 | 112000.0 | import |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 775897 | 124923798 | 2017 | ury | col | 6658 | 8658.0 | NaN | import |
| 775898 | 124923799 | 2017 | ury | col | 8924 | NaN | 4285.5 | import |
| 775899 | 124923800 | 2017 | ury | col | 8928 | 3816.5 | 25667.5 | import |
| 775900 | 124923801 | 2017 | ury | col | 8972 | 500.0 | 5306.0 | import |
| 775901 | 124923802 | 2017 | ury | col | 9310 | 5500539.5 | 10063053.0 | import |
775902 rows × 8 columns
# Same tagging for the exports frame.
exports['Transaction'] = 'export'
exports
| Unnamed: 0 | year | origin | dest | sitc4 | export_val | import_val | Transaction | |
|---|---|---|---|---|---|---|---|---|
| 0 | 761444 | 1962 | col | civ | 712 | 0.0 | 13000.0 | export |
| 1 | 761445 | 1962 | col | civ | 1210 | 12000.0 | 0.0 | export |
| 2 | 761446 | 1962 | col | cod | 2925 | 0.0 | 2000.0 | export |
| 3 | 761447 | 1962 | col | cog | 2925 | 0.0 | 2000.0 | export |
| 4 | 761448 | 1962 | col | dza | 1210 | 112000.0 | 0.0 | export |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 799401 | 124876488 | 2017 | col | ury | 6658 | NaN | 8658.0 | export |
| 799402 | 124876489 | 2017 | col | ury | 8924 | 4285.5 | NaN | export |
| 799403 | 124876490 | 2017 | col | ury | 8928 | 25667.5 | 3816.5 | export |
| 799404 | 124876491 | 2017 | col | ury | 8972 | 5306.0 | 500.0 | export |
| 799405 | 124876492 | 2017 | col | ury | 9310 | 10063053.0 | 5500539.5 | export |
799406 rows × 8 columns
# Stack imports and exports into a single balance-of-payments table.
# DataFrame.append is deprecated and was removed in pandas 2.x;
# pd.concat produces the identical result (indices are preserved).
balance_payments = pd.concat([imports, exports])
balance_payments
| Unnamed: 0 | year | origin | dest | sitc4 | export_val | import_val | Transaction | |
|---|---|---|---|---|---|---|---|---|
| 0 | 11564 | 1962 | civ | col | 712 | 13000.0 | 0.0 | import |
| 1 | 11565 | 1962 | civ | col | 1210 | 0.0 | 12000.0 | import |
| 2 | 17741 | 1962 | cod | col | 2925 | 2000.0 | 0.0 | import |
| 3 | 20216 | 1962 | cog | col | 2925 | 2000.0 | 0.0 | import |
| 4 | 25005 | 1962 | dza | col | 1210 | 0.0 | 112000.0 | import |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 799401 | 124876488 | 2017 | col | ury | 6658 | NaN | 8658.0 | export |
| 799402 | 124876489 | 2017 | col | ury | 8924 | 4285.5 | NaN | export |
| 799403 | 124876490 | 2017 | col | ury | 8928 | 25667.5 | 3816.5 | export |
| 799404 | 124876491 | 2017 | col | ury | 8972 | 5306.0 | 500.0 | export |
| 799405 | 124876492 | 2017 | col | ury | 9310 | 10063053.0 | 5500539.5 | export |
1575308 rows × 8 columns
# Attach human-readable country names. `countrys` keys countries by the
# 3-char code in its third column (`id_3char`); renaming that column lets
# us merge directly on `origin` / `dest`.
# NOTE(review): this mutates `countrys`' column names in place, and the
# two renames below are order-sensitive — `countrys` is not used again
# later, so the mutation is harmless here.
countrys.columns = ['Unnamed: 0','id','origin','name']
balance_payments = balance_payments.merge(countrys[['origin','name']], on='origin', how='left')
balance_payments.columns = ['Unnamed: 0', 'year', 'origin', 'dest', 'sitc4', 'export_val','import_val', 'Transaction', 'name_origin']
# Repeat the same trick for the destination country.
countrys.columns = ['Unnamed: 0','id','dest','name']
balance_payments = balance_payments.merge(countrys[['dest','name']], on='dest', how='left')
# Note: the product code column is also renamed sitc4 -> sitc here.
balance_payments.columns = ['Unnamed: 0', 'year', 'origin', 'dest', 'sitc', 'export_val','import_val', 'Transaction', 'name_origin', 'name_dest']
balance_payments
| Unnamed: 0 | year | origin | dest | sitc | export_val | import_val | Transaction | name_origin | name_dest | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11564 | 1962 | civ | col | 712 | 13000.0 | 0.0 | import | Cote d'Ivoire | Colombia |
| 1 | 11565 | 1962 | civ | col | 1210 | 0.0 | 12000.0 | import | Cote d'Ivoire | Colombia |
| 2 | 17741 | 1962 | cod | col | 2925 | 2000.0 | 0.0 | import | Democratic Republic of the Congo | Colombia |
| 3 | 20216 | 1962 | cog | col | 2925 | 2000.0 | 0.0 | import | Republic of the Congo | Colombia |
| 4 | 25005 | 1962 | dza | col | 1210 | 0.0 | 112000.0 | import | Algeria | Colombia |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1575303 | 124876488 | 2017 | col | ury | 6658 | NaN | 8658.0 | export | Colombia | Uruguay |
| 1575304 | 124876489 | 2017 | col | ury | 8924 | 4285.5 | NaN | export | Colombia | Uruguay |
| 1575305 | 124876490 | 2017 | col | ury | 8928 | 25667.5 | 3816.5 | export | Colombia | Uruguay |
| 1575306 | 124876491 | 2017 | col | ury | 8972 | 5306.0 | 500.0 | export | Colombia | Uruguay |
| 1575307 | 124876492 | 2017 | col | ury | 9310 | 10063053.0 | 5500539.5 | export | Colombia | Uruguay |
1575308 rows × 10 columns
# Attach product names via the SITC code, then drop the leftover CSV index.
balance_payments = balance_payments.merge(products[['sitc','name']], on='sitc', how='left')
balance_payments.columns = ['Unnamed: 0', 'year', 'origin', 'dest', 'sitc', 'export_val','import_val', 'Transaction', 'name_origin', 'name_dest', 'name_product']
balance_payments.drop(['Unnamed: 0'], axis=1, inplace=True)
balance_payments
| year | origin | dest | sitc | export_val | import_val | Transaction | name_origin | name_dest | name_product | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1962 | civ | col | 712 | 13000.0 | 0.0 | import | Cote d'Ivoire | Colombia | Coffee Extracts |
| 1 | 1962 | civ | col | 1210 | 0.0 | 12000.0 | import | Cote d'Ivoire | Colombia | Tobacco unmanufactured; tobacco refuse |
| 2 | 1962 | cod | col | 2925 | 2000.0 | 0.0 | import | Democratic Republic of the Congo | Colombia | Planting Seeds and Spores |
| 3 | 1962 | cog | col | 2925 | 2000.0 | 0.0 | import | Republic of the Congo | Colombia | Planting Seeds and Spores |
| 4 | 1962 | dza | col | 1210 | 0.0 | 112000.0 | import | Algeria | Colombia | Tobacco unmanufactured; tobacco refuse |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1575303 | 2017 | col | ury | 6658 | NaN | 8658.0 | export | Colombia | Uruguay | Miscellaneous Articles of Glass |
| 1575304 | 2017 | col | ury | 8924 | 4285.5 | NaN | export | Colombia | Uruguay | Cards and Decals |
| 1575305 | 2017 | col | ury | 8928 | 25667.5 | 3816.5 | export | Colombia | Uruguay | Miscellaneous Printed Matter |
| 1575306 | 2017 | col | ury | 8972 | 5306.0 | 500.0 | export | Colombia | Uruguay | Imitation Jewellery |
| 1575307 | 2017 | col | ury | 9310 | 10063053.0 | 5500539.5 | export | Colombia | Uruguay | Unclassified Transactions |
1575308 rows × 10 columns
#@title Geo referentiation
%%time
countries = set(list(balance_payments["name_origin"].unique())+
list(balance_payments["name_dest"].unique()))
longitude = []
latitude = []
place = []
for i in (countries):
place.append(i)
if findGeocode(i) != None:
loc = findGeocode(i)
latitude.append(loc.latitude)
longitude.append(loc.longitude)
else:
latitude.append(np.nan)
longitude.append(np.nan)
location = pd.DataFrame()
location["place"] = place
location["lon"] = longitude
location["lat"] = latitude
location.index = location["place"]
location.to_dict()
CPU times: user 1.75 s, sys: 241 ms, total: 1.99 s Wall time: 4min 1s
# Translate country names into coordinates via the `location` lookup
# (Series keyed by place name). Names missing from the lookup become NaN.
balance_payments['lat_origin'] = balance_payments["name_origin"].map(location['lat'])
balance_payments['lon_origin'] = balance_payments["name_origin"].map(location['lon'])
balance_payments['lat_dest'] = balance_payments["name_dest"].map(location['lat'])
balance_payments['lon_dest'] = balance_payments["name_dest"].map(location['lon'])
balance_payments
| year | origin | dest | sitc | export_val | import_val | Transaction | name_origin | name_dest | name_product | lat_origin | lon_origin | lat_dest | lon_dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1962 | civ | col | 712 | 13000.0 | 0.0 | import | Cote d'Ivoire | Colombia | Coffee Extracts | 7.989737 | -5.567946 | 2.889443 | -73.783892 |
| 1 | 1962 | civ | col | 1210 | 0.0 | 12000.0 | import | Cote d'Ivoire | Colombia | Tobacco unmanufactured; tobacco refuse | 7.989737 | -5.567946 | 2.889443 | -73.783892 |
| 2 | 1962 | cod | col | 2925 | 2000.0 | 0.0 | import | Democratic Republic of the Congo | Colombia | Planting Seeds and Spores | -2.981434 | 23.822264 | 2.889443 | -73.783892 |
| 3 | 1962 | cog | col | 2925 | 2000.0 | 0.0 | import | Republic of the Congo | Colombia | Planting Seeds and Spores | -0.726433 | 15.641915 | 2.889443 | -73.783892 |
| 4 | 1962 | dza | col | 1210 | 0.0 | 112000.0 | import | Algeria | Colombia | Tobacco unmanufactured; tobacco refuse | 28.000027 | 2.999983 | 2.889443 | -73.783892 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1575303 | 2017 | col | ury | 6658 | NaN | 8658.0 | export | Colombia | Uruguay | Miscellaneous Articles of Glass | 2.889443 | -73.783892 | -32.875555 | -56.020153 |
| 1575304 | 2017 | col | ury | 8924 | 4285.5 | NaN | export | Colombia | Uruguay | Cards and Decals | 2.889443 | -73.783892 | -32.875555 | -56.020153 |
| 1575305 | 2017 | col | ury | 8928 | 25667.5 | 3816.5 | export | Colombia | Uruguay | Miscellaneous Printed Matter | 2.889443 | -73.783892 | -32.875555 | -56.020153 |
| 1575306 | 2017 | col | ury | 8972 | 5306.0 | 500.0 | export | Colombia | Uruguay | Imitation Jewellery | 2.889443 | -73.783892 | -32.875555 | -56.020153 |
| 1575307 | 2017 | col | ury | 9310 | 10063053.0 | 5500539.5 | export | Colombia | Uruguay | Unclassified Transactions | 2.889443 | -73.783892 | -32.875555 | -56.020153 |
1575308 rows × 14 columns
Finally we can check in one sight the completeness level of all columns
#@title Null Values Sight
sns.set_style('darkgrid')
plt.rcParams.update({'font.size':15})
plt.figure(figsize=(25,10))
# Dark cells mark present values; light cells mark missing ones.
sns.heatmap(balance_payments.isnull(), cbar=False)
# The previous title ("Top Ten Variables") was copy-pasted from another
# chart; label the plot for what it actually shows.
plt.title("Missing Values per Variable")
plt.xlabel('Variables')
plt.ylabel('rows')
Text(199.0, 0.5, 'rows')
From this graphic we can see in the horizontal axis all the columns from the current status of the dataset, and for the vertical axis the rows, filling it you can see if the table cell is filled or is a missing value, with a black or white line respectively
So we can see very few missing values in the export and import columns, and in the origin and destination names. We will measure them later
El análisis exploratorio es parte fundamental para responder las preguntas propuestas por el equipo de data scientists de Platzi.
First let's go to explore the basic info about the last seccion results
#@title #####'balance_payments' dataset
# Schema, non-null counts and memory footprint of the merged table.
balance_payments.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1575308 entries, 0 to 1575307 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 1575308 non-null int64 1 origin 1575308 non-null object 2 dest 1575308 non-null object 3 sitc 1575308 non-null int64 4 export_val 1447814 non-null float64 5 import_val 1475726 non-null float64 6 Transaction 1575308 non-null object 7 name_origin 1574258 non-null object 8 name_dest 1571389 non-null object 9 name_product 1575308 non-null object 10 lat_origin 1572816 non-null float64 11 lon_origin 1572816 non-null float64 12 lat_dest 1572781 non-null float64 13 lon_dest 1572781 non-null float64 dtypes: float64(6), int64(2), object(6) memory usage: 180.3+ MB
From the basic info we can see categorical data like type of transaction, name of the origin and destiny country and labels
We can see also numerical data, like the import and export value transaction, latitude and longitude and the year
#@title #####Store the variables by type to use them easily
# Partition the column names by dtype so later cells can select whole
# groups (numeric vs categorical) without repeating the dtype checks.
features = balance_payments.dtypes.to_frame('type')
features_f = features.index[features['type'] == 'float64']  # float columns
features_o = features.index[features['type'] == 'object']   # text/categorical columns
features_i = features.index[features['type'] == 'int64']    # integer columns
#@title #####Numerical data
# Combined index of float and int columns; describe() gives the summary
# statistics for all numeric columns (transposed for readability).
features_num = features_f.append(features_i)
balance_payments.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| year | 1575308.0 | 1.997857e+03 | 1.417671e+01 | 1962.000000 | 1989.000000 | 2001.000000 | 2009.000000 | 2.017000e+03 |
| sitc | 1575308.0 | 6.222229e+03 | 2.339567e+03 | 10.000000 | 5514.000000 | 6760.000000 | 7723.000000 | 9.710000e+03 |
| export_val | 1447814.0 | 1.074322e+06 | 3.221065e+07 | 0.000000 | 0.000000 | 3308.000000 | 73794.750000 | 1.385914e+10 |
| import_val | 1475726.0 | 1.106083e+06 | 3.493362e+07 | 0.000000 | 0.000000 | 3344.000000 | 73000.000000 | 1.580013e+10 |
| lat_origin | 1572816.0 | 1.435434e+01 | 2.157828e+01 | -41.500083 | 2.889443 | 2.889443 | 32.647531 | 7.761923e+01 |
| lon_origin | 1572816.0 | -4.255188e+01 | 5.870840e+01 | -176.204224 | -73.783892 | -73.783892 | -3.276575 | 1.791583e+02 |
| lat_dest | 1572781.0 | 1.470021e+01 | 2.164618e+01 | -41.500083 | 2.889443 | 2.889443 | 35.000074 | 7.761923e+01 |
| lon_dest | 1572781.0 | -4.116957e+01 | 5.965435e+01 | -176.204224 | -73.783892 | -73.783892 | 1.888334 | 1.791583e+02 |
As it was said the data was given since 1962 to 2017, later we can explore better the money and geographical values
#@title #####Categorical data
# Count, cardinality, mode, and mode frequency of every object column.
balance_payments[features_o].describe(include='all').T
| count | unique | top | freq | |
|---|---|---|---|---|
| origin | 1575308 | 245 | col | 801096 |
| dest | 1575308 | 250 | col | 777592 |
| Transaction | 1575308 | 2 | export | 799406 |
| name_origin | 1574258 | 241 | Colombia | 801096 |
| name_dest | 1571389 | 243 | Colombia | 777592 |
| name_product | 1575308 | 936 | Coffee | 6433 |
We have five more destination countries than origin countries, two types of transactions and 936 different products
#@title ##### Null values
# Share of missing values per column as a percentage, worst first.
# isnull().mean() equals isnull().sum()/shape[0], so multiplying by 100
# yields the same percentages as the original formulation.
Nulos = (100 * balance_payments.isnull().mean()).to_frame('% Null values')
Nulos = Nulos.sort_values(by='% Null values', ascending=False)
Nulos
| % Null values | |
|---|---|
| export_val | 8.093274 |
| import_val | 6.321430 |
| name_dest | 0.248777 |
| lat_dest | 0.160413 |
| lon_dest | 0.160413 |
| lat_origin | 0.158191 |
| lon_origin | 0.158191 |
| name_origin | 0.066654 |
| year | 0.000000 |
| origin | 0.000000 |
| dest | 0.000000 |
| sitc | 0.000000 |
| Transaction | 0.000000 |
| name_product | 0.000000 |
As we knew from our previous results, there are very few null values: about 8%, 6% and under 1% for export values, import values and names respectively. These magnitudes won't affect the later analysis
Let's go to watch the distribution of numerical data
#@title Transaction year histogram
# Distribution of record counts per year, with a KDE overlay.
plt.rcParams.update({'font.size': 20})
sns.set_style('darkgrid')
plt.style.use("dark_background")
plt.figure(figsize=(25,6))
sns.histplot(balance_payments, x='year', color='y', kde=True)
plt.xlabel('year')
plt.ylabel("count")
plt.title('Transaction year')
Text(0.5, 1.0, 'Transaction year')
As we can see we have more transactions from 2017 than the other years, so we expect to find more transaction value for this year
#@title Transaction Export value by year
# Average export_val per year, split by transaction type.
plt.rcParams.update({'font.size': 20})
sns.set_style('darkgrid')
plt.style.use("dark_background")
plt.figure(figsize=(25,6))
sns.barplot(x="year", y="export_val", hue="Transaction", data=balance_payments,palette='icefire')
plt.xticks(rotation=90)
plt.xlabel('year')
plt.ylabel("Value")
plt.title('Export Value by year')
Text(0.5, 1.0, 'Export Value by year')
In this graphic we can see the evolution of the export and import value across all the export records; they tend to be very similar, with the highest value in 2015 and the lowest values in the sixties
The highest value is from 2015 even though 2017 has the most transactions, as we saw before
#@title Transaction import value by year
# Average import_val per year, split by transaction type.
plt.rcParams.update({'font.size': 20})
sns.set_style('darkgrid')
plt.style.use("dark_background")
plt.figure(figsize=(25,6))
sns.barplot(x="year", y="import_val", hue="Transaction", data=balance_payments,palette='YlOrBr')
plt.xticks(rotation=90)
plt.xlabel('year')
plt.ylabel("Value")
plt.title('Import Value by year')
Text(0.5, 1.0, 'Import Value by year')
In this graphic we can see the evolution of the export and import value across all the records; they tend to be very similar, with the highest value in 2015 and the lowest values in the sixties
Is very simmilar to the previous one
Let's go to watch the visualization of the quantity (count) of the categorical data
#@title Number of transactions
# Record counts per transaction type (export vs import).
pd.DataFrame(balance_payments['Transaction'].value_counts())
| Transaction | |
|---|---|
| export | 799406 |
| import | 775902 |
#@title Transaction type participation
# Pie chart of export vs import record share.
plt.style.use("Solarize_Light2")
fig, ax = plt.subplots(1,1,figsize=(15,5))
a = pd.DataFrame(balance_payments['Transaction'].value_counts())
ax.pie(np.array(list(a['Transaction'])),
labels=np.array(list(a.index)),
autopct='%1.1f%%',
shadow=False,
)
ax.set(title='Transactions')
[Text(0.5, 1.0, 'Transactions')]
There are 0.7% more records from export data
#@title Top 20 Number of transaction of products
# Twenty most frequently traded products by record count.
plt.rcParams.update({'font.size': 20})
sns.set_style('darkgrid')
plt.style.use("dark_background")
# Removed the unused `var_hist = features_o` assignment from this cell.
plt.figure(figsize=(10,10))
a = pd.DataFrame(balance_payments['name_product'].value_counts()).head(20)
y = np.array(list(a.index))
x = np.array(list(a['name_product']))
sns.barplot(x=x, y=y, palette="rocket")
plt.xlabel("count")
plt.title('Transation by Product')
Text(0.5, 1.0, 'Transation by Product')
Coffee is the product with the most transactions
#@title Top 20 Number of transaction of products by country
# Top 20 origin countries by number of import records.
sns.set_style('darkgrid')
plt.style.use("dark_background")
# Removed the unused `var_hist = features_o` assignment from this cell.
plt.figure(figsize=(10,10))
a = pd.DataFrame(balance_payments[balance_payments['Transaction'] == 'import']['name_origin'].value_counts()).head(20)
y = np.array(list(a.index))
x = np.array(list(a['name_origin']))
sns.barplot(x=x, y=y, palette="coolwarm")
plt.xlabel("count")
plt.title('Origin country')
Text(0.5, 1.0, 'Origin country')
#@title Top 20 Number of transaction of products by country
# Top 20 destination countries by number of export records.
sns.set_style('darkgrid')
plt.style.use("dark_background")
# Removed the unused `var_hist = features_o` assignment from this cell.
plt.figure(figsize=(10,10))
a = pd.DataFrame(balance_payments[balance_payments['Transaction'] == 'export']['name_dest'].value_counts()).head(20)
y = np.array(list(a.index))
x = np.array(list(a['name_dest']))
sns.barplot(x=x, y=y, palette="Spectral")
# Label after plotting, matching the sibling charts; calling xlabel
# before barplot let seaborn overwrite the label.
plt.xlabel("count")
plt.title('Destiny country')
Text(0.5, 1.0, 'Destiny country')
United States and Germany are the countries with more transactions in imports and exports
Now we're going to explore the value of export and import with all the categorical variables but we are going to do this just with the value of Colombian Transactions
#@title Colombian Transactions dataset
# Build a single "Colombian transactions" view with one unified Value
# column, keeping both halves of the data (imports and exports).
# NOTE(review): both halves read `export_val` — for import records this
# presumably represents the partner country's reported export to
# Colombia (i.e. Colombia's import); confirm against the OEC schema.
tmp = balance_payments[balance_payments['Transaction']=='import'][['year', 'export_val', 'Transaction', 'name_origin', 'name_dest', 'name_product']]
tmp.columns = ['year', 'Value', 'Transaction', 'name_origin', 'name_dest', 'name_product']
colombian_Transactions = balance_payments[balance_payments['Transaction']=='export'][['year', 'export_val', 'Transaction', 'name_origin', 'name_dest', 'name_product']]
colombian_Transactions.columns = ['year', 'Value', 'Transaction', 'name_origin', 'name_dest', 'name_product']
# DataFrame.append is deprecated (removed in pandas 2.x); pd.concat
# yields the identical stacked frame.
colombian_Transactions = pd.concat([colombian_Transactions, tmp])
colombian_Transactions
| year | Value | Transaction | name_origin | name_dest | name_product | |
|---|---|---|---|---|---|---|
| 775902 | 1962 | 0.0 | export | Colombia | Cote d'Ivoire | Coffee Extracts |
| 775903 | 1962 | 12000.0 | export | Colombia | Cote d'Ivoire | Tobacco unmanufactured; tobacco refuse |
| 775904 | 1962 | 0.0 | export | Colombia | Democratic Republic of the Congo | Planting Seeds and Spores |
| 775905 | 1962 | 0.0 | export | Colombia | Republic of the Congo | Planting Seeds and Spores |
| 775906 | 1962 | 112000.0 | export | Colombia | Algeria | Tobacco unmanufactured; tobacco refuse |
| ... | ... | ... | ... | ... | ... | ... |
| 775897 | 2017 | 8658.0 | import | Uruguay | Colombia | Miscellaneous Articles of Glass |
| 775898 | 2017 | NaN | import | Uruguay | Colombia | Cards and Decals |
| 775899 | 2017 | 3816.5 | import | Uruguay | Colombia | Miscellaneous Printed Matter |
| 775900 | 2017 | 500.0 | import | Uruguay | Colombia | Imitation Jewellery |
| 775901 | 2017 | 5500539.5 | import | Uruguay | Colombia | Unclassified Transactions |
1575308 rows × 6 columns
#@title Export and imports Value participation
# Pie chart of total monetary value by transaction type.
plt.rcParams.update({'font.size': 15})
plt.style.use("dark_background")
fig, ax = plt.subplots(1,1,figsize=(15,5))
a = colombian_Transactions.groupby(['Transaction']).sum()[['Value']]
ax.pie(np.array(list(a['Value'])),
labels=np.array(list(a.index)),
autopct='%1.1f%%',
shadow=False,
colors = ['b','y']
)
plt.title('Monetary Value participation')
Text(0.5, 1.0, 'Monetary Value participation')
As we can see historically Colombia have more exports than imports, and that is a good balance signal
#@title Top 10 products of higher import value
plt.rcParams.update({'font.size': 15})
sns.set_style('darkgrid')
plt.style.use("dark_background")
plt.figure(figsize=(7,5))
# Filter with colombian_Transactions' own column: the original indexed
# with a mask built from balance_payments, which raised "Boolean Series
# key will be reindexed" and only worked by accidental index alignment.
a = colombian_Transactions[colombian_Transactions['Transaction']=='import']
a = a.groupby(['name_product']).sum()[['Value']].sort_values(by='Value', ascending=False).head(10)
top_ten_import_products = a.index[:10]
y = np.array(list(a.index))
x = np.array(list(a['Value']))
sns.barplot(x=x, y=y, palette="dark:salmon_r")
plt.xlabel("count")
plt.title('Top Product by Import value')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Text(0.5, 1.0, 'Top Product by Import value')
As we can see, historically Colombia imports cars and many other unclassified goods
#@title Top 10 products of higher Export value
plt.rcParams.update({'font.size': 15})
sns.set_style('darkgrid')
plt.style.use("dark_background")
plt.figure(figsize=(7,5))
# Filter with colombian_Transactions' own column (see the import-value
# cell): the cross-frame mask raised a reindexing warning.
a = colombian_Transactions[colombian_Transactions['Transaction']=='export']
a = a.groupby(['name_product']).sum()[['Value']].sort_values(by='Value', ascending=False).head(10)
top_ten_export_products = a.index[:10]
y = np.array(list(a.index))
x = np.array(list(a['Value']))
sns.barplot(x=x, y=y, palette="light:b")
plt.xlabel("count")
plt.title('Top Product by Export value')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Text(0.5, 1.0, 'Top Product by Export value')
As we can see we have the typical exported products, like crude petroleum, coffee, coal, bananas and flowers
#@title Top 10 Countries of higher import value
plt.rcParams.update({'font.size': 15})
sns.set_style('darkgrid')
plt.style.use("dark_background")
plt.figure(figsize=(7,5))
# Filter with colombian_Transactions' own column: the cross-frame mask
# from balance_payments raised a reindexing warning.
a = colombian_Transactions[colombian_Transactions['Transaction']=='import']
a = a.groupby(['name_origin']).sum()[['Value']].sort_values(by='Value', ascending=False).head(10)
top_ten_import_countries = a.index[:10]
y = np.array(list(a.index))
x = np.array(list(a['Value']))
sns.barplot(x=x, y=y, palette="cubehelix")
plt.xlabel("count")
plt.title('Top Countries by Import value')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Text(0.5, 1.0, 'Top Countries by Import value')
United States, China and Mexico are the countries with the highest Import value along the history
#@title Top 10 Countries of higher Export value
plt.rcParams.update({'font.size': 15})
sns.set_style('darkgrid')
plt.style.use("dark_background")
plt.figure(figsize=(7,5))
# Filter with colombian_Transactions' own column: the cross-frame mask
# from balance_payments raised a reindexing warning.
a = colombian_Transactions[colombian_Transactions['Transaction']=='export']
a = a.groupby(['name_dest']).sum()[['Value']].sort_values(by='Value', ascending=False).head(10)
top_ten_export_countries = a.index[:10]
y = np.array(list(a.index))
x = np.array(list(a['Value']))
sns.barplot(x=x, y=y, palette="Accent")
plt.xlabel("count")
plt.title('Top Countries by Export value')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Text(0.5, 1.0, 'Top Countries by Export value')
United States and Venezuela are the countries with the highest Export value along the history
Let's visualize the geographical data
#@title #21st Century Exports Destiny
# Folium heat map of export destinations: rows with Transaction=='export',
# year > 2000, and no missing values (dropna removes rows lacking
# coordinates or transaction values).
df = balance_payments[balance_payments['Transaction']=='export']
df = df[df['year']>2000]
df = df.dropna()
# NOTE(review): max_amount is computed but never used — it was presumably
# meant to feed HeatMap's max_val instead of the hard-coded 1000; confirm
# the intended scaling before changing it.
max_amount = float(df['export_val'].max())
folium_hmap = folium.Map(location=[0, 0],
zoom_start=3,
tiles="OpenStreetMap")
# Each heat point is (lat, lon, weight) with export_val as the weight.
hm_wide = HeatMap( list(zip(df['lat_dest'], df['lon_dest'], df['export_val'])),
min_opacity=0.2,
max_val=1000,
radius=8, blur=6,
max_zoom=15,
)
folium_hmap.add_child(hm_wide)